Preparation
# Install any package that is missing, then attach everything used below.
# requireNamespace() only tests availability: unlike require() it does not
# attach the package as a side effect, and the library() calls below fail
# loudly if a package still cannot be loaded.
# readr and scales are loaded below as well, so they are checked here too.
required_packages <- c(
  "readr", "DT", "ggplot2", "tidyverse", "hrbrthemes", "dplyr", "scales"
)
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# Load stuff we need later
library(readr)
library(DT)
library(ggplot2)
library(tidyverse)
library(hrbrthemes)
library(dplyr)
library(scales)
# and set the working directory
# NOTE(review): the relative paths used later (e.g. "./models/dataset.csv")
# depend on this machine-specific directory; consider a project-relative
# path instead of setwd().
setwd("~/projects/bbs-for-independence/03_workspace")
Import Data
# Load the dataset summary from CSV (column-type message silenced) and add
# charratioDelta, the difference charratioB - charratioA, as a new column.
dataset <- read_csv(
  file = "./models/dataset.csv",
  show_col_types = FALSE
)
dataset$charratioDelta <- dataset$charratioB - dataset$charratioA
Prepare Data
# Per-category summary of the dataset columns at positions 4-8 and 13.
# NOTE(review): judging by the column names used in the summary table further
# down, these are presumably length, length_raw, avgcolumnsize, charratioA,
# charratioB and charratioDelta -- verify against the CSV header, because the
# selection is positional.
df = aggregate(x = dataset[, c(4,5,6,7,8,13)],
by = list(dataset$category),
# For every column within every category return two values:
#   mean: values are converted to numeric via character; anything that cannot
#         be converted becomes NA (the coercion warning is suppressed) and is
#         ignored by the mean, which is rounded to two decimals
#   n:    number of entries in the group (length(x), so NA entries count too)
FUN = function(x) list(
mean = round(mean(suppressWarnings(as.numeric(as.character(x))), na.rm=TRUE), digits = 2),
n = length(x)))
# The two steps below flatten the result into plain columns named
# <column>.mean and <column>.n, next to the grouping column Group.1.
df <- do.call(data.frame, df) # bind columns which contain matrices back into the data frame
df <- as.data.frame(lapply(df, unlist)) # convert lists back to vectors
# Three subsets of the dataset: everything outside the excluded categories,
# the "magazines" category alone, and the "digest" category alone.
f_selection <- filter(
  dataset,
  !(category %in% c(
    "fidonet-on-the-internet", "tap", "floppies",
    "exhibits", "artifacts", "piracy", "art",
    "magazines", "digest"
  ))
)
f_magazins <- filter(dataset, category == "magazines")
f_digest <- filter(dataset, category == "digest")

# For a threshold x, count the files of a subset whose charratioB exceeds x.
# Vectorize() lets each counter accept a whole vector of thresholds.
fun_charratioB_selection <- Vectorize(function(x) {
  nrow(filter(f_selection, charratioB > x))
})
fun_charratioB_magazins <- Vectorize(function(x) {
  nrow(filter(f_magazins, charratioB > x))
})
fun_charratioB_digest <- Vectorize(function(x) {
  nrow(filter(f_digest, charratioB > x))
})

# Long-format table for plotting: 101 thresholds (0 to 1 in steps of 0.01)
# per subset. x holds one run of thresholds and is recycled by data.frame()
# across the three stacked blocks of counts.
data_fun <- data.frame(
  x = seq(0, 1, 0.01),
  n = c(
    fun_charratioB_selection(seq(0, 1, 0.01)),
    fun_charratioB_magazins(seq(0, 1, 0.01)),
    fun_charratioB_digest(seq(0, 1, 0.01))
  ),
  categories = rep(c("selection", "magazins", "digest"), each = 101)
)
Summarize Data
# Headline counts: all files, and the number of categories (df holds one row
# per category).
cat("Anzahl Dateien: ", nrow(dataset))
## Anzahl Dateien: 105470
cat("Anzahl Kategorien: ", nrow(df))
## Anzahl Kategorien: 48
# The message lists all nine categories that are removed when f_selection is
# built; it previously left out fidonet-on-the-internet and artifacts.
cat("Anzahl Dateien bei Filterung von fidonet-on-the-internet, tap, floppies, exhibits, artifacts, piracy, art, magazines, digest:", nrow(f_selection))
## Anzahl Dateien bei Filterung von fidonet-on-the-internet, tap, floppies, exhibits, artifacts, piracy, art, magazines, digest: 44833
# Files of that selection whose charratioB is additionally above 0.95.
cat("Anzahl Dateien bei zusätzlicher Filterung von charratioB >0.95:", fun_charratioB_selection(0.95))
## Anzahl Dateien bei zusätzlicher Filterung von charratioB >0.95: 7242
# Line chart of the threshold counts in data_fun: one line per subset, the
# x axis restricted to thresholds between 0.8 and 1, counts on a log10 axis.
ggplot(data_fun, aes(x = x, y = n, colour = categories)) +
  geom_line() +
  scale_x_continuous(limits = c(0.8, 1)) +
  scale_y_log10() +
  labs(title = "n files if charratioB > x on a log10 scale")

# Interactive per-category summary table, sorted by mean charratioB in
# descending order and showing 50 rows per page.
df %>%
  arrange(desc(charratioB.mean)) %>%
  select(
    Group.1, length.n, length.mean, length_raw.mean, avgcolumnsize.mean,
    charratioA.mean, charratioB.mean, charratioDelta.mean
  ) %>%
  datatable(
    options = list(
      pageLength = 50,
      # JavaScript callback that sets the header font size and cell padding
      # once the table has been initialised.
      initComplete = JS("function(settings, json) {
$(this.api().table().header()).css({'font-size' : '12px'});
$('table.dataTable thead th').css({'padding' : '10px 18px 10px 0px'});
}")
    )
  )
Plots
Verhältnis Text (exkl. Satz- und Leerzeichen) zu Dateilänge
# Horizontal box plot of charratioA per category, with the categories ordered
# by their median charratioA.
dataset %>%
  ggplot(aes(
    x = reorder(category, charratioA, FUN = median),
    y = charratioA,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # Overlay the individual files as faint jittered points.
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # Mark each category's mean with a red cross. `fun` replaces `fun.y`,
  # which is deprecated since ggplot2 3.3.0.
  stat_summary(
    fun = mean, geom = "point", shape = 4, size = 2,
    color = "red", fill = "red"
  ) +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")

Verhältnis Text (inkl. Satz- und Leerzeichen) zu Dateilänge
# Horizontal box plot of charratioB per category, with the categories ordered
# by their median charratioB.
dataset %>%
  ggplot(aes(
    x = reorder(category, charratioB, FUN = median),
    y = charratioB,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # Overlay the individual files as faint jittered points.
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # Mark each category's mean with a red cross. `fun` replaces `fun.y`,
  # which is deprecated since ggplot2 3.3.0.
  stat_summary(
    fun = mean, geom = "point", shape = 4, size = 2,
    color = "red", fill = "red"
  ) +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Verhältnis")

Differenz der beiden Verhältnisse (inkl. minus exkl. Satz- und Leerzeichen, jeweils zu Dateilänge)
# Horizontal box plot of the difference charratioB - charratioA per category.
# The categories are ordered by the median of the plotted difference. The
# ordering key used to be charratioA - charratioB, the negative of the plotted
# value, which inverted the order relative to the other two box plots.
dataset %>%
  ggplot(aes(
    x = reorder(category, charratioB - charratioA, FUN = median),
    y = charratioB - charratioA,
    group = category
  )) +
  geom_boxplot() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # Overlay the individual files as faint jittered points.
  geom_jitter(color = "black", size = 0.4, alpha = 0.05) +
  # Mark each category's mean with a red cross. `fun` replaces `fun.y`,
  # which is deprecated since ggplot2 3.3.0.
  stat_summary(
    fun = mean, geom = "point", shape = 4, size = 2,
    color = "red", fill = "red"
  ) +
  coord_flip() +
  ylim(0, 1) +
  xlab("Kategorie") +
  ylab("Differenz beiden Verhältnissen")

Apply filtering and extend filtering
# Categories that are left out of the filtered dataset.
data_names_exclude <- c(
  "fidonet-on-the-internet", "tap", "floppies", "exhibits", "artifacts",
  "piracy", "art", "magazines", "digest"
)
# Keep the files of the remaining categories whose charratioB exceeds 0.95.
dataset_filtered <- dataset %>%
  filter(!category %in% data_names_exclude) %>%
  filter(charratioB > 0.95)

# For a threshold x, count the filtered files with length below / above x.
# Vectorize() lets both counters accept a whole vector of thresholds.
fun_length_selection_lt <- Vectorize(function(x) {
  nrow(dataset_filtered %>% filter(length < x))
})
fun_length_selection_gt <- Vectorize(function(x) {
  nrow(dataset_filtered %>% filter(length > x))
})

# Thresholds from the smallest to the largest length, in steps of 1000.
length_fun_seq <- seq(
  min(dataset_filtered$length),
  max(dataset_filtered$length),
  1000
)

# Long-format table for plotting. n stacks the "less than" counts first and
# the "greater than" counts second, so the labels must follow that order
# (they were swapped before, which mislabelled both curves). x holds one run
# of thresholds and is recycled by data.frame() across both blocks.
length_fun <- data.frame(
  x = length_fun_seq,
  n = c(
    fun_length_selection_lt(length_fun_seq),
    fun_length_selection_gt(length_fun_seq)
  ),
  categories = rep(
    c("less than", "greater than"),
    each = length(length_fun_seq)
  )
)

# One line per comparison direction; thresholds on a log10 x axis.
ggplot(length_fun, aes(x, n, col = categories)) +
  geom_line() +
  scale_x_continuous(trans = log10_trans()) +
  ggtitle("n files if length < or > x on a log10 scale")

# Final selection: files outside the excluded categories with charratioB
# above 0.95 and a length strictly between 1000 and 10000. All conditions
# passed to a single filter() call must hold for a row to be kept.
dataset_filtered_2 <- filter(
  dataset,
  !(category %in% data_names_exclude),
  charratioB > 0.95,
  length > 1000,
  length < 10000
)
cat("Anzahl Dateien mit gefilterter Länge: ", nrow(dataset_filtered_2))
## Anzahl Dateien mit gefilterter Länge: 2741